########################
##R-script for Figure A2 and A3
##Who is mobilized to vote by short text messages? Evidence from a nationwide field experiment with young voters
########################
#Edited 24/5/5

setwd("W:/dofiles/replication")
#load package needed for logging
library(logr)

log_open("Replication_log_R.log")


#load packages needed for importing stata data, analysis and graphs
library(foreign)
library(grf)
library(rpart)
library(glmnet)
library(splines)
library(MASS)
library(lmtest)
library(sandwich)
library(ggplot2)

##############
##Figure A2
##############

#set seed
set.seed(2311)
#read stata format data
data <- read.dta("W:/data/foranalysisS12.dta")
n <- nrow(data)
#specify variables
treatment <- "treated"
outcome <- "voted22"
covariates <-c("female", "age", "highschool", "immigrant", "rural")

#preparing data to fir the model
fmla <- formula(paste0("~ 0 + ", paste0(covariates, collapse="+")))
X <- model.matrix(fmla, data)
W <- data[,treatment]
Y <- data[,outcome]

#set number of rankng groups and folds
num.rankings <- 3
num.folds <- 10
folds <- sort(seq(n) %% num.folds) + 1
#set proportion of treated obs
forest <- causal_forest(X, Y, W, W.hat=.5996, clusters= folds)
#retrieve predictions
tau.hat <- predict(forest)$predictions

#rank obs within each fold into groups according to their CATE predictions
ranking <- rep(NA, n)
for (fold in seq(num.folds)) {
  tau.hat.quantiles <-quantile(tau.hat[folds == fold], probs = seq(0, 1, by=1/num.rankings))
  ranking[folds == fold] <- cut(tau.hat[folds == fold], tau.hat.quantiles, include.lowest=TRUE,labels=seq(num.rankings))
}

#average difference in means within each rank
fmla <- paste0(outcome, " ~ 0 + ranking + ranking:", treatment)
ols.ate <- lm(fmla, data=transform(data, ranking=factor(ranking)))
ols.ate <- coeftest(ols.ate, vcov=vcovHC(ols.ate, type='HC2'))
interact <- which(grepl(":", rownames(ols.ate)))
ols.ate <-data.frame("ols", paste0("Q", seq(num.rankings)), ols.ate[interact, 1:2])
rownames(ols.ate) <- NULL
colnames (ols.ate) <- c("method", "ranking", "estimate", "std.err")
ols.ate

#plot average CATE within each group with standard errors
ggplot(ols.ate)+
  aes(x = ranking, y = estimate) +
  geom_point(position=position_dodge(0.2)) +
  geom_errorbar(aes(ymin=estimate-1.96*std.err, ymax=estimate+1.96*std.err), width=.2, position=position_dodge(0.2)) +
  ylab("") + xlab("") +
  ggtitle("Average CATE within each ranking") +
  theme_minimal() + 
  theme(legend.position="bottom", legend.title = element_blank())

ggsave("W:/output/figures/quintilestmp3.png")


df <- mapply(function(covariate) {
  #compute mean covariate value per ranking with standard errors for each covariate
  fmla <- formula(paste0(covariate, "~ 0 + ranking"))
  ols <- lm(fmla, data=transform(data, ranking=factor(ranking)))
  ols.res <- coeftest(ols, vcov=vcovHC(ols, "HC2"))
  
  #retrieve results
  avg <- ols.res[,1]
  stderr <- ols.res[,2]
  #tally up results
  data.frame(covariate, avg, stderr, ranking=paste0("Q", seq(num.rankings)),
             #scaling for coloring
             scaling=pnorm((avg-mean(avg))/sd(avg)),
             #ordering
             variation=sd(avg) / sd(data[, covariate]),
             #strings for heatmap
             labels=paste0(signif(avg, 3), "\n", "(", signif(stderr, 3), ")"))
  
}, covariates, SIMPLIFY = FALSE)
df <- do.call(rbind, df)
#for heatmap tp be in decreasing order
df$covariate <- reorder(df$covariate, order(df$variation))



#plot heatmap
ggplot(df) +
  aes(ranking, covariate) +
  geom_tile(aes(fill = scaling)) +
  geom_text(aes(label = labels)) +
  scale_fill_gradient(low = "#E1BE6A", high = "#40B0A6") +
  ggtitle(paste0("Average covariate values within group")) +
  theme_minimal() +
  ylab("") + xlab("CATE estimate ranking") +
  theme(plot.title = element_text(size = 11, face = "bold"),
        axis.text=element_text(size=11))
ggsave("W:/output/figures/hmaptmp3.png")








##############
##Figure A3
##############

#clear
rm(list=ls())
#set seed
set.seed(2315)
#read stata format data
data <- read.dta("W:/data/foranalysisS12spillover.dta")
n <- nrow(data)
#specify variables
treatment <- "treatedf"
outcome <- "voted22"
covariates <-c("female", "age", "highschool", "immigrant", "rural")

#preparing data to fir the model
fmla <- formula(paste0("~ 0 + ", paste0(covariates, collapse="+")))
X <- model.matrix(fmla, data)
W <- data[,treatment]
Y <- data[,outcome]

#set number of rankng groups and folds
num.rankings <- 3
num.folds <- 10
folds <- sort(seq(n) %% num.folds) + 1
#set proportion of treated obs
forest <- causal_forest(X, Y, W, W.hat=.6028, clusters= folds)
#retrieve predictions
tau.hat <- predict(forest)$predictions

#rank obs within each fold into groups according to their CATE predictions
ranking <- rep(NA, n)
for (fold in seq(num.folds)) {
  tau.hat.quantiles <-quantile(tau.hat[folds == fold], probs = seq(0, 1, by=1/num.rankings))
  ranking[folds == fold] <- cut(tau.hat[folds == fold], tau.hat.quantiles, include.lowest=TRUE,labels=seq(num.rankings))
}

#average difference in means within each rank
fmla <- paste0(outcome, " ~ 0 + ranking + ranking:", treatment)
ols.ate <- lm(fmla, data=transform(data, ranking=factor(ranking)))
ols.ate <- coeftest(ols.ate, vcov=vcovHC(ols.ate, type='HC2'))
interact <- which(grepl(":", rownames(ols.ate)))
ols.ate <-data.frame("ols", paste0("Q", seq(num.rankings)), ols.ate[interact, 1:2])
rownames(ols.ate) <- NULL
colnames (ols.ate) <- c("method", "ranking", "estimate", "std.err")
ols.ate

#plot average CATE within each group with standard errors
ggplot(ols.ate)+
  aes(x = ranking, y = estimate) +
  geom_point(position=position_dodge(0.2)) +
  geom_errorbar(aes(ymin=estimate-1.96*std.err, ymax=estimate+1.96*std.err), width=.2, position=position_dodge(0.2)) +
  ylab("") + xlab("") +
  ggtitle("Average CATE within each ranking") +
  theme_minimal() + 
  theme(legend.position="bottom", legend.title = element_blank())

ggsave("W:/output/figures/spillover/quintilestmp3.png")


df <- mapply(function(covariate) {
  #compute mean covariate value per ranking with standard errors for each covariate
  fmla <- formula(paste0(covariate, "~ 0 + ranking"))
  ols <- lm(fmla, data=transform(data, ranking=factor(ranking)))
  ols.res <- coeftest(ols, vcov=vcovHC(ols, "HC2"))
  
  #retrieve results
  avg <- ols.res[,1]
  stderr <- ols.res[,2]
  #tally up results
  data.frame(covariate, avg, stderr, ranking=paste0("Q", seq(num.rankings)),
             #scaling for coloring
             scaling=pnorm((avg-mean(avg))/sd(avg)),
             #ordering
             variation=sd(avg) / sd(data[, covariate]),
             #strings for heatmap
             labels=paste0(signif(avg, 3), "\n", "(", signif(stderr, 3), ")"))
  
}, covariates, SIMPLIFY = FALSE)
df <- do.call(rbind, df)
#for heatmap tp be in decreasing order
df$covariate <- reorder(df$covariate, order(df$variation))



#plot heatmap
ggplot(df) +
  aes(ranking, covariate) +
  geom_tile(aes(fill = scaling)) +
  geom_text(aes(label = labels)) +
  scale_fill_gradient(low = "#E1BE6A", high = "#40B0A6") +
  ggtitle(paste0("Average covariate values within group")) +
  theme_minimal() +
  ylab("") + xlab("CATE estimate ranking") +
  theme(plot.title = element_text(size = 11, face = "bold"),
        axis.text=element_text(size=11))
ggsave("W:/output/figures/spillover/hmaptmp3.png")


























log_close()